1 Effect of UPSTM-Based Decorrelation on Feature Discovery

1.0.1 Loading the libraries

library("FRESA.CAD")
library(readxl)
library(igraph)
library(umap)
library(tsne)
library(entropy)
library(mlbench)
library(psych)
library(whitening)
library("vioplot")
library("rpart")

op <- par(no.readonly = TRUE)
pander::panderOptions('digits', 3)
pander::panderOptions('table.split.table', 400)
pander::panderOptions('keep.trailing.zeros',TRUE)

1.1 Material and Methods

mlBench library

Newman, D.J. & Hettich, S. & Blake, C.L. & Merz, C.J. (1998). UCI Repository of machine learning databases [http://www.ics.uci.edu/~mlearn/MLRepository.html]. Irvine, CA: University of California, Department of Information and Computer Science.

1.2 The Data

data(PimaIndiansDiabetes)
pander::pander(table(PimaIndiansDiabetes$diabetes))
neg pos
500 268
PimaIndiansDiabetes$diabetes <- 1*(PimaIndiansDiabetes$diabetes=="pos")

1.2.0.1 Standarize the names for the reporting

studyName <- "Diabetes"
dataframe <- PimaIndiansDiabetes
outcome <- "diabetes"

thro <- 0.25
TopVariables <- 3
cexheat = 0.45

1.3 Generaring the report

1.3.1 Libraries

Some libraries

library(psych)
library(whitening)
library("vioplot")
library("rpart")

1.3.2 Data specs

pander::pander(c(rows=nrow(dataframe),col=ncol(dataframe)-1))
rows col
768 8
pander::pander(table(dataframe[,outcome]))
0 1
500 268

varlist <- colnames(dataframe)
varlist <- varlist[varlist != outcome]

largeSet <- length(varlist) > 1500 

1.3.3 Scaling the data

Scaling and removing near zero variance columns and highly co-linear(r>0.99999) columns


  ### Some global cleaning
  sdiszero <- apply(dataframe,2,sd) > 1.0e-16
  dataframe <- dataframe[,sdiszero]

  varlist <- colnames(dataframe)[colnames(dataframe) != outcome]
  tokeep <- c(as.character(correlated_Remove(dataframe,varlist,thr=0.99999)),outcome)
  dataframe <- dataframe[,tokeep]

  varlist <- colnames(dataframe)
  varlist <- varlist[varlist != outcome]
  
  iscontinous <- sapply(apply(dataframe,2,unique),length) > 5 ## Only variables with enough samples



dataframeScaled <- FRESAScale(dataframe,method="OrderLogit")$scaledData

1.4 The heatmap of the data

numsub <- nrow(dataframe)
if (numsub > 1000) numsub <- 1000


if (!largeSet)
{

  hm <- heatMaps(data=dataframeScaled[1:numsub,],
                 Outcome=outcome,
                 Scale=TRUE,
                 hCluster = "row",
                 xlab="Feature",
                 ylab="Sample",
                 srtCol=45,
                 srtRow=45,
                 cexCol=cexheat,
                 cexRow=cexheat
                 )
  par(op)
}

1.4.0.1 Correlation Matrix of the Data

The heat map of the data


if (!largeSet)
{

  par(cex=0.6,cex.main=0.85,cex.axis=0.7)
  #cormat <- Rfast::cora(as.matrix(dataframe[,varlist]),large=TRUE)
  cormat <- cor(dataframe[,varlist],method="pearson")
  cormat[is.na(cormat)] <- 0
  gplots::heatmap.2(abs(cormat),
                    trace = "none",
  #                  scale = "row",
                    mar = c(5,5),
                    col=rev(heat.colors(5)),
                    main = "Original Correlation",
                    cexRow = cexheat,
                    cexCol = cexheat,
                     srtCol=45,
                     srtRow=45,
                    key.title=NA,
                    key.xlab="|Pearson Correlation|",
                    xlab="Feature", ylab="Feature")
  diag(cormat) <- 0
  print(max(abs(cormat)))
}

[1] 0.5443412

1.5 The decorrelation


DEdataframe <- IDeA(dataframe,verbose=TRUE,thr=thro)
#> 
#>  Included: 8 , Uni p: 0.01082532 , Uncorrelated Base: 4 , Outcome-Driven Size: 0 , Base Size: 4 
#> 
#> 
 1 <R=0.544,r=0.397,N=    4>, Top: 2( 1 )[ 1 : 2 Fa= 2 : 0.397 ]( 2 , 2 , 0 ),<|>Tot Used: 4 , Added: 2 , Zero Std: 0 , Max Cor: 0.393
#> 
 2 <R=0.393,r=0.321,N=    4>, Top: 2( 1 )[ 1 : 2 Fa= 3 : 0.321 ]( 2 , 2 , 2 ),<|>Tot Used: 6 , Added: 2 , Zero Std: 0 , Max Cor: 0.230
#> 
 3 <R=0.230,r=0.250,N=    0>
#> 
 [ 3 ], 0.2301263 Decor Dimension: 6 Nused: 6 . Cor to Base: 3 , ABase: 2 , Outcome Base: 0 
#> 
varlistc <- colnames(DEdataframe)[colnames(DEdataframe) != outcome]

pander::pander(sum(apply(dataframe[,varlist],2,var)))

15144

pander::pander(sum(apply(DEdataframe[,varlistc],2,var)))

11314

pander::pander(entropy(discretize(unlist(dataframe[,varlist]), 256)))

3.24

pander::pander(entropy(discretize(unlist(DEdataframe[,varlistc]), 256)))

3.62

1.5.1 The decorrelation matrix


if (!largeSet)
{

  par(cex=0.6,cex.main=0.85,cex.axis=0.7)
  
  UPSTM <- attr(DEdataframe,"UPSTM")
  
  gplots::heatmap.2(1.0*(abs(UPSTM)>0),
                    trace = "none",
                    mar = c(5,5),
                    col=rev(heat.colors(5)),
                    main = "Decorrelation matrix",
                    cexRow = cexheat,
                    cexCol = cexheat,
                   srtCol=45,
                   srtRow=45,
                    key.title=NA,
                    key.xlab="|Beta|>0",
                    xlab="Output Feature", ylab="Input Feature")
  
  par(op)
}

1.6 The heatmap of the decorrelated data

if (!largeSet)
{

  hm <- heatMaps(data=DEdataframe[1:numsub,],
                 Outcome=outcome,
                 Scale=TRUE,
                 hCluster = "row",
                 cexRow = cexheat,
                 cexCol = cexheat,
                 srtCol=45,
                 srtRow=45,
                 xlab="Feature",
                 ylab="Sample")
  par(op)
}

1.7 The correlation matrix after decorrelation

if (!largeSet)
{

  cormat <- cor(DEdataframe[,varlistc],method="pearson")
  cormat[is.na(cormat)] <- 0
  
  gplots::heatmap.2(abs(cormat),
                    trace = "none",
                    mar = c(5,5),
                    col=rev(heat.colors(5)),
                    main = "Correlation after IDeA",
                    cexRow = cexheat,
                    cexCol = cexheat,
                     srtCol=45,
                     srtRow=45,
                    key.title=NA,
                    key.xlab="|Pearson Correlation|",
                    xlab="Feature", ylab="Feature")
  
  par(op)
  diag(cormat) <- 0
  print(max(abs(cormat)))
}

[1] 0.2301263

1.8 U-MAP Visualization of features

1.8.1 The UMAP based on LASSO on Raw Data


if (nrow(dataframe) < 1000)
{
  classes <- unique(dataframe[1:numsub,outcome])
  raincolors <- rainbow(length(classes))
  names(raincolors) <- classes
  datasetframe.umap = umap(scale(dataframe[1:numsub,varlist]),n_components=2)
  plot(datasetframe.umap$layout,xlab="U1",ylab="U2",main="UMAP: Original",t='n')
  text(datasetframe.umap$layout,labels=dataframe[1:numsub,outcome],col=raincolors[dataframe[1:numsub,outcome]+1])
}

1.8.2 The decorralted UMAP

if (nrow(dataframe) < 1000)
{

  datasetframe.umap = umap(scale(DEdataframe[1:numsub,varlistc]),n_components=2)
  plot(datasetframe.umap$layout,xlab="U1",ylab="U2",main="UMAP: After IDeA",t='n')
  text(datasetframe.umap$layout,labels=DEdataframe[1:numsub,outcome],col=raincolors[DEdataframe[1:numsub,outcome]+1])
}

1.9 Univariate Analysis

1.9.1 Univariate



univarRAW <- uniRankVar(varlist,
               paste(outcome,"~1"),
               outcome,
               dataframe,
               rankingTest="AUC")



univarDe <- uniRankVar(varlistc,
               paste(outcome,"~1"),
               outcome,
               DEdataframe,
               rankingTest="AUC",
               )

1.9.2 Final Table


univariate_columns <- c("caseMean","caseStd","controlMean","controlStd","controlKSP","ROCAUC")

##topfive
topvar <- c(1:length(varlist)) <= TopVariables
pander::pander(univarRAW$orderframe[topvar,univariate_columns])
  caseMean caseStd controlMean controlStd controlKSP ROCAUC
glucose 141.3 31.94 110.0 26.14 0.0472 0.788
mass 35.1 7.26 30.3 7.69 0.0408 0.688
age 37.1 10.97 31.2 11.67 0.0000 0.687


topLAvar <- univarDe$orderframe$Name[str_detect(univarDe$orderframe$Name,"La_")]
topLAvar <- unique(c(univarDe$orderframe$Name[topvar],topLAvar[1:as.integer(TopVariables/2)]))
finalTable <- univarDe$orderframe[topLAvar,univariate_columns]

theLaVar <- rownames(finalTable)[str_detect(rownames(finalTable),"La_")]

pander::pander(univarDe$orderframe[topLAvar,univariate_columns])
  caseMean caseStd controlMean controlStd controlKSP ROCAUC
glucose 141.26 31.94 110.0 26.14 4.72e-02 0.788
La_mass 30.84 7.00 26.5 6.93 2.12e-02 0.686
pregnant 4.87 3.74 3.3 3.02 1.11e-16 0.620

dc <- getLatentCoefficients(DEdataframe)
fscores <- attr(DEdataframe,"fscore")

theSigDc <- dc[theLaVar]
names(theSigDc) <- NULL
theSigDc <- unique(names(unlist(theSigDc)))


theFormulas <- dc[rownames(finalTable)]
deFromula <- character(length(theFormulas))
names(deFromula) <- rownames(finalTable)

pander::pander(c(mean=mean(sapply(dc,length)),total=length(dc),fraction=length(dc)/(ncol(dataframe)-1)))
mean total fraction
2.33 3 0.375


allSigvars <- names(dc)



dx <- names(deFromula)[1]
for (dx in names(deFromula))
{
  coef <- theFormulas[[dx]]
  cname <- names(theFormulas[[dx]])
  names(cname) <- cname
  for (cf in names(coef))
  {
    if (cf != dx)
    {
      if (coef[cf]>0)
      {
        deFromula[dx] <- paste(deFromula[dx],
                               sprintf("+ %5.3f*%s",coef[cf],cname[cf]))
      }
      else
      {
        deFromula[dx] <- paste(deFromula[dx],
                               sprintf("%5.3f*%s",coef[cf],cname[cf]))
      }
    }
  }
}

finalTable <- rbind(finalTable,univarRAW$orderframe[theSigDc[!(theSigDc %in% rownames(finalTable))],univariate_columns])


orgnamez <- rownames(finalTable)
orgnamez <- str_remove_all(orgnamez,"La_")
finalTable$RAWAUC <- univarRAW$orderframe[orgnamez,"ROCAUC"]
finalTable$DecorFormula <- deFromula[rownames(finalTable)]
finalTable$fscores <- fscores[rownames(finalTable)]

Final_Columns <- c("DecorFormula","caseMean","caseStd","controlMean","controlStd","controlKSP","ROCAUC","RAWAUC","fscores")

finalTable <- finalTable[order(-finalTable$ROCAUC),]
pander::pander(finalTable[,Final_Columns])
  DecorFormula caseMean caseStd controlMean controlStd controlKSP ROCAUC RAWAUC fscores
glucose 141.26 31.94 110.0 26.14 4.72e-02 0.788 0.788 1
mass NA 35.14 7.26 30.3 7.69 4.08e-02 0.688 0.688 NA
La_mass -0.194triceps + 1.000mass 30.84 7.00 26.5 6.93 2.12e-02 0.686 0.688 -1
pregnant 4.87 3.74 3.3 3.02 1.11e-16 0.620 0.620 1
triceps NA 22.16 17.68 19.7 14.89 3.11e-15 0.554 0.554 2

1.10 Comparing IDeA vs PCA vs EFA

1.10.1 PCA

featuresnames <- colnames(dataframe)[colnames(dataframe) != outcome]
pc <- prcomp(dataframe[,iscontinous],center = TRUE,tol=0.002)   #principal components
predPCA <- predict(pc,dataframe[,iscontinous])
PCAdataframe <- as.data.frame(cbind(predPCA,dataframe[,!iscontinous]))
colnames(PCAdataframe) <- c(colnames(predPCA),colnames(dataframe)[!iscontinous]) 
#plot(PCAdataframe[,colnames(PCAdataframe)!=outcome],col=dataframe[,outcome],cex=0.65,cex.lab=0.5,cex.axis=0.75,cex.sub=0.5,cex.main=0.75)

#pander::pander(pc$rotation)


PCACor <- cor(PCAdataframe[,colnames(PCAdataframe) != outcome])


  gplots::heatmap.2(abs(PCACor),
                    trace = "none",
  #                  scale = "row",
                    mar = c(5,5),
                    col=rev(heat.colors(5)),
                    main = "PCA Correlation",
                    cexRow = 0.5,
                    cexCol = 0.5,
                     srtCol=45,
                     srtRow= -45,
                    key.title=NA,
                    key.xlab="Pearson Correlation",
                    xlab="Feature", ylab="Feature")

1.10.2 EFA


EFAdataframe <- dataframeScaled

if (length(iscontinous) < 2000)
{
  topred <- min(length(iscontinous),nrow(dataframeScaled),ncol(predPCA)/2)
  if (topred < 2) topred <- 2
  
  uls <- fa(dataframeScaled[,iscontinous],nfactors=topred,rotate="varimax",warnings=FALSE)  # EFA analysis
  predEFA <- predict(uls,dataframeScaled[,iscontinous])
  EFAdataframe <- as.data.frame(cbind(predEFA,dataframeScaled[,!iscontinous]))
  colnames(EFAdataframe) <- c(colnames(predEFA),colnames(dataframeScaled)[!iscontinous]) 


  
  EFACor <- cor(EFAdataframe[,colnames(EFAdataframe) != outcome])
  
  
    gplots::heatmap.2(abs(EFACor),
                      trace = "none",
    #                  scale = "row",
                      mar = c(5,5),
                      col=rev(heat.colors(5)),
                      main = "EFA Correlation",
                      cexRow = 0.5,
                      cexCol = 0.5,
                       srtCol=45,
                       srtRow= -45,
                      key.title=NA,
                      key.xlab="Pearson Correlation",
                      xlab="Feature", ylab="Feature")
}

1.11 Effect on CAR modeling

par(op)
par(xpd = TRUE)
dataframe[,outcome] <- factor(dataframe[,outcome])
rawmodel <- rpart(paste(outcome,"~."),dataframe,control=rpart.control(maxdepth=3))
pr <- predict(rawmodel,dataframe,type = "class")

  ptab <- list(er="Error",detail=matrix(nrow=6,ncol=1))
  if (length(unique(pr))>1)
  {
    plot(rawmodel,main="Raw",branch=0.5,uniform = TRUE,compress = TRUE,margin=0.1)
    text(rawmodel, use.n = TRUE,cex=0.75)
    ptab <- epiR::epi.tests(table(pr==0,dataframe[,outcome]==0))
  }


pander::pander(table(dataframe[,outcome],pr))
  0 1
0 443 57
1 118 150
pander::pander(ptab)
  • detail:

    statistic est lower upper
    ap 0.270 0.2384 0.302
    tp 0.349 0.3152 0.384
    se 0.560 0.4980 0.620
    sp 0.886 0.8548 0.913
    diag.ac 0.772 0.7408 0.801
    diag.or 9.880 6.8489 14.251
    nndx 2.244 1.8777 2.834
    youden 0.446 0.3529 0.533
    pv.pos 0.725 0.6584 0.784
    pv.neg 0.790 0.7536 0.823
    lr.pos 4.910 3.7613 6.409
    lr.neg 0.497 0.4326 0.571
    p.rout 0.730 0.6976 0.762
    p.rin 0.270 0.2384 0.302
    p.tpdn 0.114 0.0875 0.145
    p.tndp 0.440 0.3800 0.502
    p.dntp 0.275 0.2157 0.342
    p.dptn 0.210 0.1773 0.246
  • tab:

      Outcome + Outcome - Total
    Test + 150 57 207
    Test - 118 443 561
    Total 268 500 768
  • method: exact

  • digits: 2

  • conf.level: 0.95

pander::pander(ptab$detail[c(5,3,4,6),])
  statistic est lower upper
5 diag.ac 0.772 0.741 0.801
3 se 0.560 0.498 0.620
4 sp 0.886 0.855 0.913
6 diag.or 9.880 6.849 14.251

par(op)
par(xpd = TRUE)
DEdataframe[,outcome] <- factor(DEdataframe[,outcome])
IDeAmodel <- rpart(paste(outcome,"~."),DEdataframe,control=rpart.control(maxdepth=3))
pr <- predict(IDeAmodel,DEdataframe,type = "class")

  ptab <- list(er="Error",detail=matrix(nrow=6,ncol=1))
  if (length(unique(pr))>1)
  {
    plot(IDeAmodel,main="IDeA",branch=0.5,uniform = TRUE,compress = TRUE,margin=0.1)
    text(IDeAmodel, use.n = TRUE,cex=0.75)
    ptab <- epiR::epi.tests(table(pr==0,DEdataframe[,outcome]==0))
  }

pander::pander(table(DEdataframe[,outcome],pr))
  0 1
0 416 84
1 98 170
pander::pander(ptab)
  • detail:

    statistic est lower upper
    ap 0.331 0.298 0.365
    tp 0.349 0.315 0.384
    se 0.634 0.574 0.692
    sp 0.832 0.796 0.864
    diag.ac 0.763 0.731 0.793
    diag.or 8.591 6.104 12.090
    nndx 2.144 1.799 2.704
    youden 0.466 0.370 0.556
    pv.pos 0.669 0.608 0.727
    pv.neg 0.809 0.773 0.842
    lr.pos 3.776 3.045 4.682
    lr.neg 0.440 0.374 0.517
    p.rout 0.669 0.635 0.702
    p.rin 0.331 0.298 0.365
    p.tpdn 0.168 0.136 0.204
    p.tndp 0.366 0.308 0.426
    p.dntp 0.331 0.273 0.392
    p.dptn 0.191 0.158 0.227
  • tab:

      Outcome + Outcome - Total
    Test + 170 84 254
    Test - 98 416 514
    Total 268 500 768
  • method: exact

  • digits: 2

  • conf.level: 0.95

pander::pander(ptab$detail[c(5,3,4,6),])
  statistic est lower upper
5 diag.ac 0.763 0.731 0.793
3 se 0.634 0.574 0.692
4 sp 0.832 0.796 0.864
6 diag.or 8.591 6.104 12.090

par(op)
par(xpd = TRUE)
PCAdataframe[,outcome] <- factor(PCAdataframe[,outcome])
PCAmodel <- rpart(paste(outcome,"~."),PCAdataframe,control=rpart.control(maxdepth=3))
pr <- predict(PCAmodel,PCAdataframe,type = "class")
ptab <- list(er="Error",detail=matrix(nrow=6,ncol=1))
if (length(unique(pr))>1)
{
  plot(PCAmodel,main="PCA",branch=0.5,uniform = TRUE,compress = TRUE,margin=0.1)
  text(PCAmodel, use.n = TRUE,cex=0.75)
  ptab <- epiR::epi.tests(table(pr==0,PCAdataframe[,outcome]==0))
}

pander::pander(table(PCAdataframe[,outcome],pr))
  0 1
0 421 79
1 88 180
pander::pander(ptab)
  • detail:

    statistic est lower upper
    ap 0.337 0.304 0.372
    tp 0.349 0.315 0.384
    se 0.672 0.612 0.728
    sp 0.842 0.807 0.873
    diag.ac 0.783 0.752 0.811
    diag.or 10.900 7.679 15.474
    nndx 1.947 1.665 2.387
    youden 0.514 0.419 0.600
    pv.pos 0.695 0.635 0.750
    pv.neg 0.827 0.791 0.859
    lr.pos 4.251 3.415 5.292
    lr.neg 0.390 0.327 0.465
    p.rout 0.663 0.628 0.696
    p.rin 0.337 0.304 0.372
    p.tpdn 0.158 0.127 0.193
    p.tndp 0.328 0.272 0.388
    p.dntp 0.305 0.250 0.365
    p.dptn 0.173 0.141 0.209
  • tab:

      Outcome + Outcome - Total
    Test + 180 79 259
    Test - 88 421 509
    Total 268 500 768
  • method: exact

  • digits: 2

  • conf.level: 0.95

pander::pander(ptab$detail[c(5,3,4,6),])
  statistic est lower upper
5 diag.ac 0.783 0.752 0.811
3 se 0.672 0.612 0.728
4 sp 0.842 0.807 0.873
6 diag.or 10.900 7.679 15.474


par(op)

1.11.1 EFA


  EFAdataframe[,outcome] <- factor(EFAdataframe[,outcome])
  EFAmodel <- rpart(paste(outcome,"~."),EFAdataframe,control=rpart.control(maxdepth=3))
  pr <- predict(EFAmodel,EFAdataframe,type = "class")
  
  ptab <- list(er="Error",detail=matrix(nrow=6,ncol=1))
  if (length(unique(pr))>1)
  {
    plot(EFAmodel,main="EFA",branch=0.5,uniform = TRUE,compress = TRUE,margin=0.1)
    text(EFAmodel, use.n = TRUE,cex=0.75)
    ptab <- epiR::epi.tests(table(pr==0,EFAdataframe[,outcome]==0))
  }


  pander::pander(table(EFAdataframe[,outcome],pr))
  0 1
0 476 24
1 168 100
  pander::pander(ptab)
  • detail:

    statistic est lower upper
    ap 0.161 0.136 0.1894
    tp 0.349 0.315 0.3838
    se 0.373 0.315 0.4341
    sp 0.952 0.929 0.9690
    diag.ac 0.750 0.718 0.7803
    diag.or 11.806 7.313 19.0590
    nndx 3.076 2.481 4.0904
    youden 0.325 0.244 0.4031
    pv.pos 0.806 0.726 0.8719
    pv.neg 0.739 0.703 0.7727
    lr.pos 7.774 5.107 11.8320
    lr.neg 0.658 0.599 0.7237
    p.rout 0.839 0.811 0.8639
    p.rin 0.161 0.136 0.1894
    p.tpdn 0.048 0.031 0.0706
    p.tndp 0.627 0.566 0.6849
    p.dntp 0.194 0.128 0.2742
    p.dptn 0.261 0.227 0.2966
  • tab:

      Outcome + Outcome - Total
    Test + 100 24 124
    Test - 168 476 644
    Total 268 500 768
  • method: exact

  • digits: 2

  • conf.level: 0.95

  pander::pander(ptab$detail[c(5,3,4,6),])
  statistic est lower upper
5 diag.ac 0.750 0.718 0.780
3 se 0.373 0.315 0.434
4 sp 0.952 0.929 0.969
6 diag.or 11.806 7.313 19.059
  par(op)